1 Setup

suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(gplots)
  library(here)
  library(hyperSpec)
  library(parallel)
  library(pander)
  library(plotly)
  library(pvclust)
  library(tidyverse)
  library(tximport)
  library(vsn)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
hpal <- colorRampPalette(c("blue","white","red"))(100)
samples <- read_csv(here("doc/macrogen-samples.csv"),
                      col_types=cols(.default=col_factor()))

JH: I use the Expression Profile.gene.txt from StringTie and I select only the Feature_GID and the read_count columns of each sample.

I transform the column Feature_GID into the row names => this will be my ‘counts’ object

ND I added a filter to remove small RNA (“misc_RNA”, “rRNA”, “snRNA”, “snoRNA”,“tRNA” )

counts <- suppressWarnings(suppressMessages(read_tsv(here("data/macrogen/ALB_RNAseq_for_Nico/R_analysis_JH/salmon/Expression_Profile.GCF_006496715.gene.txt"), 
                   col_names = TRUE, col_types = cols(.default=col_guess())) %>% 
  filter(Type %in% c("protein_coding","pseudogene","lncRNA","transcribed_pseudogene")) %>% 
  select(c("Feature_GID",ends_with("Read_Count")))  %>% 
  column_to_rownames("Feature_GID")))

Sanity check to ensure that the data is sorted according to the sample info

stopifnot(all(colnames(counts) == samples$Sample_ID))

1.1 Quality Control

  • Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "23.6% percent (8931) of 37866 genes are not expressed"
  • Let us take a look at the sequencing depth, colouring by CHANGEME
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>% 
  bind_cols(samples)

ggplot(dat,aes(x,y,fill=Virus)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())

ggplot(dat,aes(x,y,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())

ggplot(dat,aes(x,y,fill=Cells)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())

  • Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage shows a very high shoulder on the right side.

A lot of genes are lowly expressed. This is probably indicating too high a sequencing depth.

ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 8931 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by the different meta-data.

There is no obvious deviation between samples

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Virus=samples$Virus[match(ind,samples$Sample_ID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$Sample_ID)]) %>%
  mutate(Cells=samples$Cells[match(ind,samples$Sample_ID)])

ggplot(dat,aes(x=values,group=ind,col=Virus)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 292536 rows containing non-finite values (stat_density).

ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 292536 rows containing non-finite values (stat_density).

ggplot(dat,aes(x=values,group=ind,col=Cells)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 292536 rows containing non-finite values (stat_density).

1.2 Export

dir.create(here("data/analysis_macrogen/bioQA"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis_macrogen/bioQA/raw-unormalised-gene-expression_data.csv"))

2 Data normalisation

2.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.

## JH: I do not have txi so I use DESeqDataSetFromMatrix instead
## JH: I block the effect of the Cells in the design, as we do not want yet to model it in the DEG 1 to 4

dds <- DESeqDataSetFromMatrix(
  counts,
  colData = samples,
  design = ~ Cells + Virus * Time
)
## converting counts to integer mode
save(dds,file=here("data/analysis_macrogen/bioQA/dds.rda"))

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is relatively uniform -15% / + 30% of the average

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
9_Read_Count 10_Read_Count 11_Read_Count 13_Read_Count 14_Read_Count
1.125 0.9863 0.9059 0.9279 0.996
Table continues below
15_Read_Count 37_Read_Count 39_Read_Count 40_Read_Count 41_Read_Count
1.084 0.8832 1.003 0.9704 1.02
Table continues below
42_Read_Count 43_Read_Count 45_Read_Count 46_Read_Count 47_Read_Count
0.9899 1.292 1.08 0.8196 0.9273
49_Read_Count 50_Read_Count 52_Read_Count
1.046 1.251 0.861
boxplot(sizes, main="Sequencing libraries size factor")

Assess whether there might be a difference in library size linked to a given metadata

We should keep an eye on the Virus=yes samples as they seem to have a slightly deeper sequencing depth than the no counterpart.

boxplot(split(sizes,dds$Virus),las=2,
        main="Sequencing libraries size factor by treatment with Virus")

boxplot(split(sizes,dds$Time),las=2,
        main="Sequencing libraries size factor by Time")

boxplot(split(sizes,dds$Cells),las=2,
        main="Sequencing libraries size factor by Cell liness")

It does not seem all so bad, a couple samples are outliers for the “yes” class.

plot(sizes,log10(colSums(counts(dds))),ylab="log10 raw depth",xlab="scaling factor",
     col=rainbow(n=nlevels(dds$Virus))[as.integer(dds$Virus)],pch=19)
legend("bottomright",fill=rainbow(n=nlevels(dds$Virus)),
       legend=levels(dds$Virus),cex=0.6)

2.2 Variance Stabilising Transformation

For visualisation, in addition to the correction for the sequencing depth above, we want to correct for the mean-variance relationship that exists in RNA-Seq data, hence we apply a transformation to render the data homoscedastic

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked adequately, impressively so even.

meanSdPlot(vst[rowSums(vst)>0,])

2.3 QC on the normalised data

2.3.1 PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
  • Cumulative components effect

We define the number of variable of the model

nvar=3

An the number of possible combinations

nlevel=nlevels(dds$Virus) * nlevels(dds$Time) * nlevels(dds$Cells)

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

A lot of the variance is explained by the first components, while all samples are needed to explain all the variance in the data

The inflection point arise early (4) after which the increase is linearly constant

All these are promising.

ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
## Warning: Removed 3 row(s) containing missing values (geom_path).

2.3.2 2D

The first components separates the cell types. The second is the time within C6.36

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    as.data.frame(colData(dds)))

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Cells,text=Sample_ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

2.3.3 Sequencing depth

Number of genes expressed per condition at different cutoffs

Here we try to assess whether there would be any link between sequencing depth and the variable of interest.

It would seem not to be the case or limited to highly expressed genes in the case of a Virus yes vs. no comparison

conds <- factor(paste(dds$Cells,dds$Time))
dev.null <- rangeSamplesSummary(counts=vst,
                                conditions=conds,
                                nrep=3)

dev.null <- rangeSamplesSummary(counts=vst,
                                conditions=dds$Virus,
                                nrep=6)

2.3.4 Heatmap

Filter for noise

sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 3 y values <= 0 omitted from
## logarithmic plot

vst.cutoff <- 1
  • Heatmap of “all” genes The heatmap is showing the same as the PCA, Time and Cells are preponderant
hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = paste(conds,dds$Virus),
          col=hpal)

There are a few interesting things here. U4.4 further separates based on virus presence.

C6.36 does not at 24h and neither does it at 6h unless there would be a sample swap…

plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=paste(conds,dds$Virus))

2.3.5 Hierarchical clustering

Done to assess the previous dendrogram’s reproducibility

hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
                       method.hclust = "ward.D2", 
                       nboot = 1000, parallel = TRUE)
## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.

There is little doubt about the clustering, the au score is always close to the maximum (100 for a 0-100 range)

plot(hm.pvclust, labels = conds)
pvrect(hm.pvclust)

2.4 Conclusion

The data is of good quality. It might have been sequenced too deeply.

The data might not satisfy the expected study design, as the Virus effect is minimal and only appears in the U4.4 cell type.

A putative sample swap for the C6.36 cells at 6h could be investigated, but it might just be a random effect due to the little variance effect created by the Virus variable overall.

3 Session Info

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.58.0                  tximport_1.18.0            
##  [3] forcats_0.5.1               stringr_1.4.0              
##  [5] dplyr_1.0.6                 purrr_0.3.4                
##  [7] readr_1.4.0                 tidyr_1.1.3                
##  [9] tibble_3.1.1                tidyverse_1.3.1            
## [11] pvclust_2.2-0               plotly_4.9.4               
## [13] pander_0.6.3                hyperSpec_0.99-20201127    
## [15] xml2_1.3.2                  ggplot2_3.3.3              
## [17] lattice_0.20-41             here_1.0.1                 
## [19] gplots_3.1.1                DESeq2_1.30.1              
## [21] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [23] MatrixGenerics_1.2.1        matrixStats_0.59.0         
## [25] GenomicRanges_1.42.0        GenomeInfoDb_1.26.7        
## [27] IRanges_2.24.1              S4Vectors_0.28.1           
## [29] BiocGenerics_0.36.1         data.table_1.14.0          
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-1       ellipsis_0.3.2         rprojroot_2.0.2       
##  [4] XVector_0.30.0         fs_1.5.0               rstudioapi_0.13       
##  [7] hexbin_1.28.2          farver_2.1.0           affyio_1.60.0         
## [10] bit64_4.0.5            AnnotationDbi_1.52.0   fansi_0.5.0           
## [13] lubridate_1.7.10       splines_4.0.3          cachem_1.0.5          
## [16] geneplotter_1.68.0     knitr_1.33             jsonlite_1.7.2        
## [19] broom_0.7.6            annotate_1.68.0        dbplyr_2.1.1          
## [22] png_0.1-7              BiocManager_1.30.15    compiler_4.0.3        
## [25] httr_1.4.2             backports_1.2.1        assertthat_0.2.1      
## [28] Matrix_1.3-3           fastmap_1.1.0          lazyeval_0.2.2        
## [31] limma_3.46.0           cli_2.5.0              htmltools_0.5.1.1     
## [34] tools_4.0.3            affy_1.68.0            gtable_0.3.0          
## [37] glue_1.4.2             GenomeInfoDbData_1.2.4 Rcpp_1.0.6            
## [40] cellranger_1.1.0       jquerylib_0.1.4        vctrs_0.3.8           
## [43] preprocessCore_1.52.1  crosstalk_1.1.1        xfun_0.22             
## [46] rvest_1.0.0            testthat_3.0.2         lifecycle_1.0.0       
## [49] gtools_3.9.2           XML_3.99-0.6           zlibbioc_1.36.0       
## [52] scales_1.1.1           hms_1.1.0              RColorBrewer_1.1-2    
## [55] yaml_2.2.1             memoise_2.0.0          sass_0.4.0            
## [58] latticeExtra_0.6-29    stringi_1.6.2          RSQLite_2.2.7         
## [61] highr_0.9              genefilter_1.72.1      caTools_1.18.2        
## [64] BiocParallel_1.24.1    rlang_0.4.11           pkgconfig_2.0.3       
## [67] bitops_1.0-7           evaluate_0.14          labeling_0.4.2        
## [70] htmlwidgets_1.5.3      bit_4.0.4              tidyselect_1.1.1      
## [73] magrittr_2.0.1         R6_2.5.0               generics_0.1.0        
## [76] DelayedArray_0.16.3    DBI_1.1.1              pillar_1.6.1          
## [79] haven_2.4.1            withr_2.4.2            survival_3.2-10       
## [82] RCurl_1.98-1.3         modelr_0.1.8           crayon_1.4.1          
## [85] KernSmooth_2.23-18     utf8_1.2.1             rmarkdown_2.8         
## [88] jpeg_0.1-8.1           locfit_1.5-9.4         readxl_1.3.1          
## [91] blob_1.2.1             reprex_2.0.0           digest_0.6.27         
## [94] xtable_1.8-4           munsell_0.5.0          viridisLite_0.4.0     
## [97] bslib_0.2.5.1